In [1]:
    
from pymongo import MongoClient
from datetime import datetime, date, timedelta, time
from dateutil import parser
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from mpl_toolkits.basemap import Basemap
import pymysql
matplotlib.style.use('ggplot')
    
In [2]:
    
def toUnix(datetime):
    unix = datetime.strftime('%s')
    return unix
def convertDatetime(unix):
    dt = datetime.fromtimestamp(unix)
    return dt.strftime('%Y-%m-%d %H:%M:%S')
def convertDate(unix):
    d = date.fromtimestamp(unix)
    return d.strftime('%Y-%m-%d')
    
In [3]:
    
class RailDatabase():
    def __init__(self, isNew):
        if isNew:
            self.conn = pymysql.connect(host='localhost', port=3306, user='jnevens', passwd='Panda85?', db='newrailDB')
        else:
            self.conn = pymysql.connect(host='localhost', port=3306, user='jnevens', passwd='Panda85?', db='oldrailDB')
    def getAllRoutes(self, date):
        C = self.conn.cursor()
        C.execute('SELECT route_id FROM route WHERE date = %s', (date,))
        rows = C.fetchall()
        C.close()
        return rows
    def getStops(self, routeID):
        C = self.conn.cursor()
        C.execute('SELECT * FROM stop WHERE route_id = %s', (routeID,))
        rows = C.fetchall()
        C.close()
        return rows
    def getLastStop(self, routeID):
        C = self.conn.cursor()
        C.execute('SELECT * FROM stop WHERE route_id = %s ORDER BY arrival_datetime', (routeID,))
        rows = C.fetchall()
        last = rows[-1:]
        C.close()
        return last
    def getAllStations(self):
        C = self.conn.cursor()
        C.execute('SELECT * FROM station')
        rows = C.fetchall()
        C.close()
        return rows
    
class TweetDatabase():
    def __init__(self, isNew):
        self.conn = MongoClient().data_science
        self.new = isNew
        
    def connect(self):
        if self.new:
            return self.conn.new_tweets
        else:
            return self.conn.old_tweets
    
class Config():
    def __init__(self, isNew):
        self.new = isNew
        
    ##EQUAL PERIOD BEFORE AND AFTER NEW SCHEDULE##
    def period(self):
        if self.new:
            return (date(2014, 10, 27), date(2015, 2, 1))
        else:
            return (date(2014, 10, 27), date(2014, 12, 14))
    
In [4]:
    
def tweetsPerDay(isNew):
    conf = Config(isNew)
    
    start, stop = conf.period()
    period = pd.date_range(start, stop)
    #Tweets Per Day
    tpd = pd.Series(0, index = period)
    db = TweetDatabase(False).connect()
    oldTweets = db.find()
    for tweet in oldTweets:
        create_datetime = tweet['created_at']
        create_datetime = parser.parse(create_datetime, ignoretz = True)
        create_date = create_datetime.date()
        d = create_date.isoformat()
        tpd[d] += 1
    db = TweetDatabase(True).connect()
    newTweets = db.find()
    for tweet in newTweets:
        create_datetime = tweet['created_at']
        create_datetime = parser.parse(create_datetime, ignoretz = True)
        create_date = create_datetime.date()
        if create_date <= stop:
            d = create_date.isoformat()
            tpd[d] += 1
    return tpd
    
In [6]:
    
tpd = tweetsPerDay(True)
fig = plt.figure()
plt.bar(np.arange(tpd.size), tpd, color='b')
plt.axvline(49, color='r')
plt.title('Tweets per day')
plt.xlabel('Date')
plt.ylabel('Amount of Tweets')
plt.ylim(ymax=1800)
fig.autofmt_xdate()
plt.savefig('./plots/equal_period_tweets.pdf')
plt.show()
    
In [9]:
    
def delaysPerDay(isNew):
    conf = Config(isNew)
    start, stop = conf.period()
    limit = date(2014, 12, 14)
    diff = stop - start
    period = pd.date_range(start, stop)
    zeros = np.zeros((diff.days+1, 2))
    #DataFrame with worst-case and avg delay on entire network, per day
    #Delay is computed in a worst-case scenario, i.e. max delay of a train
    #divided by the amount of trains on that day.
    #In avg-case scenario, i.e. avg of avg of arrival and avg of departure delay
    #divided by the amount of trains on that day.
    delays = pd.DataFrame(zeros, index = period, columns = ['Worst case', 'Avg case'])
    while(start <= stop):
        n = start > limit
        db = RailDatabase(n)
        t = time(0, 0, 0)
        dt = datetime.combine(start, t)
        #Get all routes for a specified date.
        #Every trainID rides only once each day, so there is no point in asking all the trainIDs first
        routes = db.getAllRoutes(toUnix(dt))
        #Instead of keeping a list of worst_delays {which is O(n) in memory}, 
        #the worst_delay is accumulated and divided by the amount of data points {which is O(1) in memory}
        total_worst_delay = 0
        worst_delay_count = 0
        total_avg_delay = 0
        avg_delay_count = 0
        for routeRow in routes:
            routeID = routeRow[0]
            stops = db.getStops(routeID)
            max_delay = 0
            total_arrival_delay = 0
            arrival_delay_count = 0
            total_departure_delay = 0
            departure_delay_count = 0
            for stopRow in stops:
                arrival_detected = stopRow[3]
                departure_detected = stopRow[6]
                arrival_delay = stopRow[2]
                departure_delay = stopRow[5]
                if arrival_detected:
                    total_arrival_delay += arrival_delay
                    arrival_delay_count += 1
                if departure_detected:
                    total_departure_delay += departure_delay
                    departure_delay_count += 1
                if max(arrival_delay, departure_delay) > max_delay:
                    max_delay = max(arrival_delay, departure_delay)
            total_worst_delay += max_delay
            worst_delay_count += 1
            if arrival_delay_count == 0:
                arrival_delay_count = 1
            if departure_delay_count == 0:
                departure_delay_count = 1
            avg_arrival = float(total_arrival_delay) / float(arrival_delay_count)
            avg_departure = float(total_departure_delay) / float(departure_delay_count)
            total_avg_delay += np.mean([avg_arrival, avg_departure])
            avg_delay_count += 1
        key = start.isoformat()
        if worst_delay_count == 0:
            worst_delay_count = 1
        if avg_delay_count == 0:
            avg_delay_count = 1
        delays['Worst case'][key] = float(total_worst_delay) / float(worst_delay_count)
        delays['Avg case'][key] = float(total_avg_delay) / float(avg_delay_count)
        delta = timedelta(days=1)
        start = start + delta
    return delays
    
In [10]:
    
delays = delaysPerDay(True)
rolling = pd.rolling_mean(delays, 4, center=True)
ax_delays = delays.plot(x_compat=True, style='--', color=['r', 'b'])
rolling.plot(color=['r','b'], ax=ax_delays, legend=0)
plt.axvline('2014-12-14', color='g')
plt.title('Delays per day on entire network')
plt.xlabel('Date')
plt.ylabel('Minutes')
plt.savefig('./../../Paper/plots/equal_period_delays.png')
    
In [8]:
    
def cancelsPerDay(isNew):
    conf = Config(isNew)
    start, stop = conf.period()
    limit = date(2014, 12, 14)
    period = pd.date_range(start, stop)
    #Series with the amount of cancelled trains, per day
    #A train is considered cancelled when arrival is not detected at its final stop
    cancels = pd.Series(0, index = period)
    while(start <= stop):
        n = start > limit
        db = RailDatabase(n)
        t = time(0, 0, 0)
        dt = datetime.combine(start, t)
        key = start.isoformat()
        #Get all routes for a specified date.
        #Every trainID rides only once each day, so there is no point in asking all the trainIDs first
        routes = db.getAllRoutes(toUnix(dt))
        for routeRow in routes:
            routeID = routeRow[0]
            lastStop = db.getLastStop(routeID)
            for stopRow in lastStop:
                arrival_detected = stopRow[3]
                if not arrival_detected:
                    cancels[key] += 1
        delta = timedelta(days=1)
        start = start + delta
    return cancels
    
In [9]:
    
cancels = cancelsPerDay(True)
fig = plt.figure()
plt.bar(np.arange(98), cancels, color='b')
plt.axvline(49, color='r')
plt.title('Cancelled trains per day')
plt.xlabel('Date')
plt.ylabel('Cancelled trains')
fig.autofmt_xdate()
plt.savefig('./plots/equal_period_cancels.pdf')
plt.show()
    
In [6]:
    
def percentageDelays(isNew):
    conf = Config(isNew)
    start, stop = conf.period()
    diff = stop - start
    period = pd.date_range(start, stop)
    zeros = np.zeros((diff.days+1, 2))
    limit = date(2014, 12, 14)
    #DataFrame with the number of delayed trains per day
    #A train is considered delayed when it suffers a delay of more than 5 minutes at any stop
    #The 2nd column is the percentage of delayed trains on that day.
    perctDelays = pd.DataFrame(zeros, index = period, columns = ['Delayed', 'Percentage'])
    while(start <= stop):
        n = start > limit
        db = RailDatabase(n)
        
        t = time(0, 0, 0)
        dt = datetime.combine(start, t)
        #Get all routes for a specified date.
        #Every trainID rides only once each day, so there is no point in asking all the trainIDs first
        routes = db.getAllRoutes(toUnix(dt))
        key = start.isoformat()
        count = len(routes)
        #Get all stops for the route
        for routeRow in routes:
            routeID = routeRow[0]
            stops = db.getStops(routeID)
            #If a stop is found in the route, with > 5min delay
            #then this train is considered delayed!
            for stopRow in stops:
                arrival_detected = stopRow[3]
                departure_detected = stopRow[6]
                arrival_delay = stopRow[2]
                departure_delay = stopRow[5]
                if arrival_detected:
                    if arrival_delay > 5:
                        perctDelays['Delayed'][key] += 1
                        break
                elif departure_detected:
                    if departure_delay > 5:
                        perctDelays['Delayed'][key] += 1
                        break
        perctDelays['Percentage'][key] = (float(perctDelays['Delayed'][key]) / float(count)) * float(100)
        delta = timedelta(days=1)
        start = start + delta
    return perctDelays
    
In [7]:
    
perctDelays = percentageDelays(True)
fig = plt.figure()
plt.bar(perctDelays.index, perctDelays['Percentage'], color='b')
plt.axvline('2014-12-15', color='r')
plt.title('Percentage of delayed trains per day')
plt.xlabel('Date')
plt.ylabel('Percentage')
plt.ylim(ymax=60)
fig.autofmt_xdate()
plt.savefig('./plots/equal_period_percentage_delays.pdf')
plt.show()
    
In [ ]: